Comparison over three datasets

Result Comparison

2020 Presidential Election Results

# Load the county-level 2020 presidential results, the state-level summary,
# and the state lookup table (its `State` column is renamed to `state` so it
# can be joined to the election data).
vote_df <- read_csv("./datasets/president_county_candidate.csv")
state_sum <- read_csv("./datasets/president_state.csv")
region_df <-
  read_csv("./datasets/states.csv") %>%
  rename(state = State)

# Per-state party totals and a flag for the winning party.
# Built from `vote_df` rather than parsing the same CSV a second time.
# The result stays grouped by `state`, as before.
election_winner_df <-
  vote_df %>%
  group_by(state, party) %>%
  mutate(party_total = sum(total_votes)) %>%
  group_by(state) %>%
  mutate(
    # TRUE on the rows of the party with the largest state-wide total
    state_winner = party_total == max(party_total),
    # votes cast in the state across all parties and counties
    state_total = sum(total_votes)
  )

# Winning candidate of every state together with the state's region.
winner_region <-
  election_winner_df %>%
  left_join(region_df) %>%
  filter(state_winner) %>%
  select(state, candidate, state_total, Region) %>%
  distinct()

# One row per state winner, keyed by the lower-case state name (`region`)
# so it can be joined onto the map polygons.
election_map_df <-
  election_winner_df %>%
  mutate(region = tolower(state)) %>%
  filter(state_winner) %>%
  select(state, candidate, party_total, state_total, region) %>%
  distinct()

# State polygons from map_data("state"), joined to each state's winner.
usa_map <- map_data("state")
usa_election_map <- usa_map %>% left_join(election_map_df)

# Fill colour per candidate.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Choropleth of the actual election winner; the `text` aesthetic is only
# used as the hover tooltip by ggplotly() below.
election_result_map <-
  usa_election_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = candidate,
      text = paste(
        "State: ", state,
        "</br></br>Candidate: ", candidate,
        "</br>Votes: ", party_total,
        "</br>Winning Proportion: ", round(party_total / state_total, 2)
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  scale_fill_manual(values = colors) +
  labs(title = "Election Results across states") +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(election_result_map, tooltip = "text")
Tweets regarding 2020 Presidential Election

# Read the two CSV files collected for one hashtag, combine them with a full
# outer merge, and split the tweet-creation and user-join timestamps into
# year / month / day / time columns.
#
# path_a, path_b: paths of the two CSV files to combine
# hashtag_label:  value stored in the `hashtag` column of the result
load_hashtag_tweets <- function(path_a, path_b, hashtag_label) {
  merge(
    read_csv(path_a),
    read_csv(path_b),
    all = TRUE
  ) %>%
    # Drop the `X1` column (presumably the unnamed row-index column, which
    # readr >= 2.0 names `...1` instead); any_of() removes whichever of the
    # two names is present instead of erroring on the missing one.
    select(-any_of(c("X1", "...1"))) %>%
    separate(
      created_at,
      into = c("creation_date", "creation_time"),
      sep = " "
    ) %>%
    separate(
      creation_date,
      into = c("creation_year", "creation_month", "creation_day"),
      sep = "-"
    ) %>%
    separate(
      user_join_date,
      into = c("join_date", "join_time"),
      sep = " "
    ) %>%
    separate(
      join_date,
      into = c("join_year", "join_month", "join_day"),
      sep = "-"
    ) %>%
    # `!!` forces the function argument, so an existing data column with the
    # same name can never shadow it
    mutate(hashtag = !!hashtag_label)
}

trump_df <-
  load_hashtag_tweets(
    "./datasets/trump1.csv",
    "./datasets/trump2.csv",
    "Trump"
  )

biden_df <-
  load_hashtag_tweets(
    "./datasets/biden1.csv",
    "./datasets/biden2.csv",
    "Biden"
  )

# All tweets for both hashtags, restricted to the United States.
tweets_usa <-
  merge(biden_df, trump_df, all = TRUE) %>%
  filter(country == "United States of America")
usa_map <- map_data("state")

# For every state, score each hashtag as (sum of likes) * (number of tweets)
# and record in `top` which hashtag scores higher.
tweet_map <-
  tweets_usa %>%
  group_by(state, hashtag) %>%
  # "drop_last" is the default for this grouping; spelling it out keeps the
  # result grouped by `state` and silences the summarise() message
  summarise(
    count = n(),
    likes = sum(likes),
    .groups = "drop_last"
  ) %>%
  mutate(
    likes_tweets = likes * count,
    region = tolower(state)
  ) %>%
  # `state` is listed explicitly (it was previously re-added implicitly as the
  # grouping variable); the tweet map's tooltip reads it
  select(state, region, hashtag, likes_tweets) %>%
  pivot_wider(
    names_from = "hashtag",
    values_from = "likes_tweets"
  ) %>%
  mutate(
    # A state with tweets for only one hashtag has NA in the other column
    # after pivot_wider(). Treat a missing score as 0 on BOTH sides;
    # previously only `Trump` was coalesced, so a state with Trump tweets but
    # no Biden tweets ended up with top = NA. Exact ties still yield NA.
    top = case_when(
      coalesce(Biden, 0) > coalesce(Trump, 0) ~ "Biden",
      coalesce(Trump, 0) > coalesce(Biden, 0) ~ "Trump"
    )
  )


# Map polygons joined to the leading hashtag per state, with the hashtag
# relabelled to the candidate's full name so it matches the colour palette.
states_tweet_map <-
  usa_map %>%
  left_join(tweet_map) %>%
  mutate(top = recode(top, "Trump" = "Donald Trump", "Biden" = "Joe Biden"))

# Fill colour per candidate.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Choropleth of the candidate leading in tweets; `text` is the hover tooltip
# picked up by ggplotly() below.
tweet_result_map <-
  states_tweet_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = top,
      text = paste(
        "State: ", state,
        "</br></br>Candidate: ", top
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  scale_fill_manual(values = colors) +
  labs(title = "Tweets Results across states") +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(tweet_result_map, tooltip = "text")
Poll regarding 2020 Presidential Election

## Clean the polls dataset

# Split the model date into month / day, keep the per-state forecast columns,
# and rename them by candidate: the challenger (`_chal`) columns become
# `biden_*`, the incumbent (`_inc`) columns become `trump_*`.
# The candidate-name columns are not kept (the original select() listed them
# and negated them again in the same call).
polls_df <-
  read_csv("./datasets/presidential_state_toplines_2020.csv") %>%
  rename(date = modeldate) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y")) %>%
  separate(date, into = c("year", "month", "day")) %>%
  select(
    state, month, day,
    biden_winstate = winstate_chal,
    biden_voteshare = voteshare_chal,
    trump_winstate = winstate_inc,
    trump_voteshare = voteshare_inc,
    voteshare_margin = margin,
    expvote_turnout = state_turnout
  ) %>%
  arrange(state, month, day)

# State lookup table, with `State` renamed to `state` to match the polls data.
region_df <-
  read_csv("./datasets/states.csv") %>%
  rename(state = State)

## Merge polls and regional data
# Inner merge on `state`; the state code is dropped and `state` / `Region`
# are moved to the front.
polls_merge <-
  polls_df %>%
  merge(region_df, by = "state") %>%
  arrange(state, month, day) %>%
  select(-`State Code`) %>%
  relocate(state, Region)
## Expected vote proportion and number of votes for each candidate by state

# Uses the November rows only: averages expected turnout and vote share per
# state, reshapes to one row per candidate, and converts the share (in
# percent) into an expected vote count.
exp_votes <-
  polls_merge %>%
  filter(month == 11) %>%
  select(-trump_winstate, -biden_winstate, -voteshare_margin, -Division) %>%
  rename(State = state) %>%
  group_by(Region, State, month) %>%
  summarize(
    voter_turnout = mean(expvote_turnout),
    prop_Biden = mean(biden_voteshare),
    prop_Trump = mean(trump_voteshare)
  ) %>%
  pivot_longer(
    cols = c(prop_Biden, prop_Trump),
    names_to = "Candidate",
    names_prefix = "prop_",
    values_to = "votes_proportion"
  ) %>%
  mutate(candidate_votes = (votes_proportion / 100) * voter_turnout)
# Candidate with the most expected votes in each state, keyed by the
# lower-case state name (`region`) for joining onto the map polygons.
state_winner_df <-
  exp_votes %>%
  group_by(State) %>%
  mutate(
    state_winner = candidate_votes == max(candidate_votes),
    region = tolower(State)
  ) %>%
  filter(state_winner) %>%
  select(-month) %>%
  distinct()

usa_map <- map_data("state")

# Map polygons joined to the poll-projected winner per state, with the
# candidate relabelled to the full name used by the colour palette.
us_election_map <-
  usa_map %>%
  left_join(state_winner_df) %>%
  mutate(
    Candidate = recode(Candidate, "Trump" = "Donald Trump", "Biden" = "Joe Biden")
  )

# Fill colour per candidate.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Choropleth of the projected winner; `text` is the hover tooltip picked up
# by ggplotly() below.
election_map <-
  us_election_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = Candidate,
      text = paste(
        "State: ", State,
        "</br></br>Candidate: ", Candidate,
        "</br>Expected Votes Proportion: ", votes_proportion
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  scale_fill_manual(values = colors) +
  labs(title = "Anticipated Election Results across states") +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(election_map, tooltip = "text")
Tweets regarding 2020 Presidential Election

# NOTE(review): this section repeats the tweet-loading code that already
# appears earlier in this file; it looks like residue of an unresolved merge
# conflict. Consider keeping only one copy.

# Read the two CSV files collected for one hashtag, combine them with a full
# outer merge, and split the tweet-creation and user-join timestamps into
# year / month / day / time columns.
#
# path_a, path_b: paths of the two CSV files to combine
# hashtag_label:  value stored in the `hashtag` column of the result
load_hashtag_tweets <- function(path_a, path_b, hashtag_label) {
  merge(
    read_csv(path_a),
    read_csv(path_b),
    all = TRUE
  ) %>%
    # Drop the `X1` column (presumably the unnamed row-index column, which
    # readr >= 2.0 names `...1` instead); any_of() removes whichever of the
    # two names is present instead of erroring on the missing one.
    select(-any_of(c("X1", "...1"))) %>%
    separate(
      created_at,
      into = c("creation_date", "creation_time"),
      sep = " "
    ) %>%
    separate(
      creation_date,
      into = c("creation_year", "creation_month", "creation_day"),
      sep = "-"
    ) %>%
    separate(
      user_join_date,
      into = c("join_date", "join_time"),
      sep = " "
    ) %>%
    separate(
      join_date,
      into = c("join_year", "join_month", "join_day"),
      sep = "-"
    ) %>%
    # `!!` forces the function argument, so an existing data column with the
    # same name can never shadow it
    mutate(hashtag = !!hashtag_label)
}

trump_df <-
  load_hashtag_tweets(
    "./datasets/trump1.csv",
    "./datasets/trump2.csv",
    "Trump"
  )

biden_df <-
  load_hashtag_tweets(
    "./datasets/biden1.csv",
    "./datasets/biden2.csv",
    "Biden"
  )

# All tweets for both hashtags, restricted to the United States.
tweets_usa <-
  merge(biden_df, trump_df, all = TRUE) %>%
  filter(country == "United States of America")
usa_map <- map_data("state")

# For every state, score each hashtag as (sum of likes) * (number of tweets)
# and record in `top` which hashtag scores higher.
tweet_map <-
  tweets_usa %>%
  group_by(state, hashtag) %>%
  # "drop_last" is the default for this grouping; spelling it out keeps the
  # result grouped by `state` and silences the summarise() message
  summarise(
    count = n(),
    likes = sum(likes),
    .groups = "drop_last"
  ) %>%
  mutate(
    likes_tweets = likes * count,
    region = tolower(state)
  ) %>%
  # `state` is listed explicitly (it was previously re-added implicitly as the
  # grouping variable); the tweet map's tooltip reads it
  select(state, region, hashtag, likes_tweets) %>%
  pivot_wider(
    names_from = "hashtag",
    values_from = "likes_tweets"
  ) %>%
  mutate(
    # A state with tweets for only one hashtag has NA in the other column
    # after pivot_wider(). Treat a missing score as 0 on BOTH sides;
    # previously only `Trump` was coalesced, so a state with Trump tweets but
    # no Biden tweets ended up with top = NA. Exact ties still yield NA.
    top = case_when(
      coalesce(Biden, 0) > coalesce(Trump, 0) ~ "Biden",
      coalesce(Trump, 0) > coalesce(Biden, 0) ~ "Trump"
    )
  )


# Map polygons joined to the leading hashtag per state, with the hashtag
# relabelled to the candidate's full name so it matches the colour palette.
states_tweet_map <-
  usa_map %>%
  left_join(tweet_map) %>%
  mutate(top = recode(top, "Trump" = "Donald Trump", "Biden" = "Joe Biden"))

# Fill colour per candidate.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Choropleth of the candidate leading in tweets; `text` is the hover tooltip
# picked up by ggplotly() below.
tweet_result_map <-
  states_tweet_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = top,
      text = paste(
        "State: ", state,
        "</br></br>Candidate: ", top
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  scale_fill_manual(values = colors) +
  labs(title = "Tweets Results across states") +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(tweet_result_map, tooltip = "text")
Comparison of the three data sources

# Poll map data with `State` renamed to `state` so it shares the key column
# with the other two map data frames.
us_election_df <- rename(us_election_map, state = State)

# Combine election result, tweet and poll map data (merged on all columns
# they have in common) and keep the columns used by the comparisons below.
comparison_df <-
  usa_election_map %>%
  merge(states_tweet_map) %>%
  merge(us_election_df) %>%
  select(
    state, voter_turnout, Candidate, votes_proportion,
    party_total, top, candidate, state_total
  )

Difference between poll and election result

# States where the poll's projected winner differs from the election winner,
# with the gap between projected turnout and the votes actually cast.
comparison_df %>%
  select(
    state,
    voter_turnout,
    poll_winner = Candidate,
    election_winner = candidate,
    state_total
  ) %>%
  filter(poll_winner != election_winner) %>%
  group_by(state) %>%
  mutate(poll_vs_result = voter_turnout - state_total) %>%
  select(-voter_turnout, -state_total) %>%
  unique() %>%
  knitr::kable()
state poll_winner election_winner poll_vs_result
North Carolina Joe Biden Donald Trump 195821.7
Florida Joe Biden Donald Trump 110595.3

From this computation, we can see that the poll mispredicted the winner in North Carolina and Florida, and that it overestimated voter turnout there by roughly 196 thousand and 111 thousand votes, respectively. Even though Donald Trump won these two states, the number of votes actually cast was lower than the poll anticipated.

Difference between poll and election result in proportion

# Same mismatching states, comparing the poll winner's projected vote share
# with the election winner's actual share (both in percent).
comparison_df %>%
  select(
    state,
    votes_proportion,
    poll_winner = Candidate,
    party_total,
    election_winner = candidate,
    state_total
  ) %>%
  filter(poll_winner != election_winner) %>%
  group_by(state) %>%
  mutate(
    votes_proportion = round(votes_proportion, 2),
    prop_vote = round(party_total / state_total, 4) * 100,
    poll_vs_result = votes_proportion - prop_vote
  ) %>%
  select(-party_total, -state_total) %>%
  unique() %>%
  knitr::kable()
state votes_proportion poll_winner election_winner prop_vote poll_vs_result
North Carolina 50.53 Joe Biden Donald Trump 49.93 0.6
Florida 50.82 Joe Biden Donald Trump 51.22 -0.4

In terms of vote share, the poll anticipated that Joe Biden would win North Carolina with 50.53% of the vote and Florida with 50.82% of the vote. In fact, however, Donald Trump won North Carolina with 49.93% of the vote and Florida with 51.22% of the vote.

Difference between Tweet and Election Result

# States where the candidate leading in tweets differs from the election
# winner.
comparison_df %>%
  select(
    state,
    election_winner = candidate,
    top_tweet = top
  ) %>%
  filter(top_tweet != election_winner) %>%
  group_by(state) %>%
  unique() %>%
  knitr::kable()
state election_winner top_tweet
Oklahoma Donald Trump Joe Biden
Texas Donald Trump Joe Biden
Nebraska Donald Trump Joe Biden
North Dakota Donald Trump Joe Biden
South Dakota Donald Trump Joe Biden
Montana Donald Trump Joe Biden
Utah Donald Trump Joe Biden
California Joe Biden Donald Trump
Oregon Joe Biden Donald Trump
Washington Joe Biden Donald Trump
Maine Joe Biden Donald Trump
Rhode Island Joe Biden Donald Trump
Vermont Joe Biden Donald Trump
Connecticut Joe Biden Donald Trump
Pennsylvania Joe Biden Donald Trump
Maryland Joe Biden Donald Trump
Virginia Joe Biden Donald Trump
District of Columbia Joe Biden Donald Trump
West Virginia Donald Trump Joe Biden
South Carolina Donald Trump Joe Biden
Ohio Donald Trump Joe Biden
Michigan Joe Biden Donald Trump
Indiana Donald Trump Joe Biden
Illinois Joe Biden Donald Trump
Mississippi Donald Trump Joe Biden
Louisiana Donald Trump Joe Biden
Iowa Donald Trump Joe Biden

We found 27 states where the candidate leading in tweets differs from the election winner. One possible explanation is that tweets mentioning a specific candidate are not necessarily positive; they may just as well be negative. We can explore the tweets with the word cloud app we built to obtain more insights.

Difference among Tweet, Poll, and Election Result

# States where at least two of the three sources (poll, election result,
# tweets) disagree on the leading candidate.
comparison_df %>%
  select(
    state,
    poll_winner = Candidate,
    election_winner = candidate,
    top_tweet = top
  ) %>%
  filter(
    poll_winner != election_winner |
      top_tweet != election_winner |
      poll_winner != top_tweet
  ) %>%
  group_by(state) %>%
  unique() %>%
  knitr::kable()
state poll_winner election_winner top_tweet
Oklahoma Donald Trump Donald Trump Joe Biden
Texas Donald Trump Donald Trump Joe Biden
Nebraska Donald Trump Donald Trump Joe Biden
North Dakota Donald Trump Donald Trump Joe Biden
South Dakota Donald Trump Donald Trump Joe Biden
Montana Donald Trump Donald Trump Joe Biden
Utah Donald Trump Donald Trump Joe Biden
California Joe Biden Joe Biden Donald Trump
Oregon Joe Biden Joe Biden Donald Trump
Washington Joe Biden Joe Biden Donald Trump
Maine Joe Biden Joe Biden Donald Trump
Rhode Island Joe Biden Joe Biden Donald Trump
Vermont Joe Biden Joe Biden Donald Trump
Connecticut Joe Biden Joe Biden Donald Trump
Pennsylvania Joe Biden Joe Biden Donald Trump
Maryland Joe Biden Joe Biden Donald Trump
Virginia Joe Biden Joe Biden Donald Trump
North Carolina Joe Biden Donald Trump Donald Trump
District of Columbia Joe Biden Joe Biden Donald Trump
West Virginia Donald Trump Donald Trump Joe Biden
South Carolina Donald Trump Donald Trump Joe Biden
Florida Joe Biden Donald Trump Donald Trump
Ohio Donald Trump Donald Trump Joe Biden
Michigan Joe Biden Joe Biden Donald Trump
Indiana Donald Trump Donald Trump Joe Biden
Illinois Joe Biden Joe Biden Donald Trump
Mississippi Donald Trump Donald Trump Joe Biden
Louisiana Donald Trump Donald Trump Joe Biden
Iowa Donald Trump Donald Trump Joe Biden

To sum up the differences among the three datasets, the poll still performed better than the popular-tweets analysis. This may be because we have not yet taught the machine to distinguish the meaning of words (for example, whether a tweet is positive or negative). Once it can, tweet analysis may provide more insight into the election.